Phylogenetic analysis based on identity

Load generic libraries

source('configuration.r')
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

Load plot specific libraries

library(ape)
library(ggtree)
## ggtree v1.15.6  For help: https://guangchuangyu.github.io/software/ggtree
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## - Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
## 
## - Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, accepted. doi: 10.1093/molbev/msy194
## 
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
## 
##     rotate
## The following object is masked from 'package:tidyr':
## 
##     expand
library(HiveR)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
## Warning in fun(libname, pkgname): couldn't connect to display ":0"

Merge data

df <- read.table('../tables/species_in_assembly_qc_pass.dat', head=TRUE, stringsAsFactors = FALSE, comment.char = '!')
meta.illumina <- read.table('../metadata/illumina_metadata.txt', head=TRUE)[,-2]
meta.nanopore <- read.table('../metadata/nanopore.metadata.txt', head=TRUE, sep='\t', strip.white = TRUE)
meta.merged <- merge(meta.nanopore, meta.illumina,
                     by.x='Illumina_Library_ID',
                     by.y='Library')
meta.merged <- merge(df, meta.merged,
                     by.x='lib',
                     by.y='Nanopore_ID')[,c(4,1:3,5:18)]
colnames(meta.merged)[3] <- 'Species'
write.table(meta.merged, '../output_tables/merged_assembly_metadata.tsv', quote=F, row.names = F, col.names = T, sep='\t')

Function to plot the tree with heatmap

## function to plot tree
plot.tree <- function(x, color, offset=0.01, title=NULL){
  dist.dat <- read.table(x)
  #idx <- str_detect(rownames(dist.dat), 's_') & rownames(dist.dat) %in% meta.merged[,1]
  idx <- rownames(dist.dat) %in% (filter(meta.merged, genome_quality=='HIGH_QUAL'))$genomeID | ##High quality genomes
    str_detect(rownames(dist.dat), 'G') # use ['G|F'] to include the new data 
  dist.dat <- dist.dat[idx, idx]
  
  ## clustering
  cluster.full <- hclust(as.dist(dist.dat), method='single')
  clusters <- cutree(cluster.full, h=0.001)
  ## filter patient data
  clusters <- clusters[str_detect(names(clusters), 's_')]
  
  strains <- sapply(unique(clusters), function(x) clusters[clusters==x][1])
  cluster.strains <- hclust(as.dist(dist.dat[names(strains), names(strains)]), method='single')
  
  merged <- merge(data.frame(clusters), meta.merged, by.x=0, by.y=1) %>%  
    mutate(strain=paste0('s', clusters))
  
  antibiotics <- select(merged, clusters, Antibiotics) %>% group_by(clusters, Antibiotics) %>% 
    count() %>% spread(Antibiotics,n,fill=0) %>% merge(data.frame(strains, id=names(strains)), by.x=1, by.y=1) %>% 
    select(-clusters, -BHI) %>% data.frame(row.names = 'id')
  antibiotics[antibiotics > 0] <- 'D'
  
  p <- ggtree(as.phylo(cluster.strains), layout="fan", open.angle=45, lwd=1.5) %<+%
    merged + 
    geom_tippoint(size=3, shape=19, col=color) +
    geom_tiplab2(aes(label=strain), size=5, offset=0.0015)
  
  p <- gheatmap(p, offset=offset, antibiotics, color='black', colnames_offset_y = 0.3,##colnames_offset_x=10,
                colnames_angle = 60, hjust =1, font.size=6.5) + scale_fill_manual(values=c('white', color)) + 
    theme(legend.position ='none') + 
    ggtitle(title)
  return(list(p=p, dat=merged))
}

Generate tree plots

colors <- pal_npg('nrc')(8)
g1 <- plot.tree('../tables/Elizabethkingia_anophelis_mummer_heatmap.dat', colors[1], 0.004, 'Elizabethkingia anophelis')
g2 <- plot.tree('../tables/Staphylococcus_epidermidis_mummer_heatmap.dat', colors[2], 0.004, 'Staphylococcus epidermidis')
g3 <- plot.tree('../tables/Staphylococcus_aureus_mummer_heatmap.dat', colors[3], 0.005, 'Staphylococcus aureus')
g4 <- plot.tree('../tables/Acinetobacter_baumannii_mummer_heatmap.dat', colors[4], 0.007, 'Acinetobacter baumannii')
g5 <- plot.tree('../tables/Pseudomonas_aeruginosa_mummer_heatmap.dat', colors[5], 0.004, 'Pseudomonas aeruginosa')
g6 <- plot.tree('../tables/Enterococcus_faecium_mummer_heatmap.dat', colors[6], 0.0055, 'Enterococcus faecium')
g7 <- plot.tree('../tables/Enterococcus_faecalis_mummer_heatmap.dat', colors[7], 0.005, 'Enterococcus faecalis')
g8 <- plot.tree('../tables/Klebsiella_pneumoniae_mummer_heatmap.dat', colors[8], 0.006, 'Klebsiella pneumoniae')


write.table(rbind(g1$dat, g2$dat, g3$dat, g4$dat, g5$dat, g6$dat, g7$dat, g8$dat), 
            '../output_tables/strain_cluster_summary.tsv', sep='\t', row.names = F, col.names = T, quote=F)

Main figure part (S. aureus)

g3$p

Supplementary figure part

s1 <- cowplot::plot_grid(g2$p, g4$p, g5$p, g6$p, g7$p, g8$p, nrow=1)
s1

Hive plot for species distribution

Generate edge data

rbind(g2$dat, g3$dat, g4$dat, g5$dat, g6$dat, g7$dat, g8$dat) %>%
  group_by(Species, clusters, Sample_type, Room_type, #bed_number,
           timept, Sample_ID.y, Cubicle_room) %>%
  tally()  %>% ungroup() %>% 
  mutate(clusters=sprintf("%02d", clusters)) %>% 
  group_by(Species, clusters, Sample_type, Cubicle_room, Room_type, timept) %>% 
  count %>% select(-n) %>% 
  mutate(label1='Strain', label2='Site', label3='Room') %>% 
  unite(n1,c(label1, Species, clusters), sep='=', remove=F) %>% 
  unite(n2,c(label2, Sample_type), sep='=', remove=F) %>% 
  unite(n3,c(label3, Room_type, Cubicle_room), sep='=',remove=F) %>% 
  select(-label1, -label2, -label3) -> edge_data

## remove strains only occurred once at one place
edge_data <- filter(edge_data, n1%in% 
         (edge_data %>% group_by(n1) %>% count %>% filter(n>1))$n1)

Hive plot function

hiveplot <- function(species, col, silent=FALSE){ 
  edge_plot <- filter(edge_data ,Species==species)
  strain_col <- col
  
  rbind(data.frame(x1=edge_plot$n1, x2=edge_plot$n2, color=edge_plot$timept, stringsAsFactors = F) ,
             data.frame(x1=edge_plot$n1, x2=edge_plot$n3, color=edge_plot$timept, stringsAsFactors = F)
             ) %>% 
    group_by(x1, x2, color) %>% count() %>% 
    select(x1, x2, weight=n, color) %>% 
    arrange(desc(x1,x2)) -> e
  
  hive <- edge2HPD(data.frame(e[,1:3]))
  hive$nodes$axis <- as.integer(as.factor(str_split_fixed(hive$nodes$lab, '=', 2)[,1]))
  hive$nodes$tag <- str_split_fixed(hive$nodes$lab, '=', 3)[,1] 
  ## species color
  hive$nodes$color[ hive$nodes$tag == 'Strain'] <- strain_col
  ## site color
  colors <- sapply(c(pal_simpsons('springfield')(16)), adjustcolor, alpha.f=0.9)
  colormap <- data.frame(site=levels(edge_data$Sample_type), col=colors[1:9], row.names = 1, stringsAsFactors = F)
  site.id <- hive$nodes$tag=='Site'
  hive$nodes$color[site.id] <- colormap[str_split_fixed(hive$nodes$lab, '=', 2)[site.id, 2], ]
  ## room color
  colormap <-data.frame(room=unique(edge_data$Room_type), col=colors[c(13,15,16)], row.names = 1, stringsAsFactors = F)
  room.id <- hive$nodes$tag=='Room'
  hive$nodes$color[room.id] <- colormap[str_split_fixed(hive$nodes$lab[room.id], '=', 3)[,2], ]
  
  hive$nodes$size=2
  hive$edges$weight <- hive$edges$weight*3-1
  hive$edges$color <- ifelse(e$color<2, '#ff990055','#66ccff55')
  #hive$edges$color <- ifelse(e$color<2,'#66ccff77',  '#ff330077')
  tmp <- data.frame(node.lab=hive$nodes$lab, node.text=hive$nodes$lab, angle=0, radius=0, offset=0, hjust=1, vjust=0.5)
  mutate(tmp, node.text=str_replace(node.text, 'Strain=.*=', 'Strain=s')) %>% 
    separate(col=node.text, into=c('lab','node.text'), '=', extra='merge') %>% 
    mutate(node.text=str_replace_all(node.text, '_', ' ')) %>% 
    mutate(node.text=str_replace_all(node.text, '=', ': ')) %>% 
    mutate(offset=ifelse(lab=='Room' | lab=='Site' , -0.05, -0.03)) %>% 
    select(-lab) %>% 
  write.table('tmp_hive.txt', sep=',', quote=T, row.names = F, col.names = T)
  if(silent){return(hive)}
  plotHive(hive, ch=0.2, method='ranknorm', bkgnd='white', anNodes = 'tmp_hive.txt',
          anNode.gpar=gpar(cex=1.5))
}

Main figure part (S. aureus)

hiveplot('Staphylococcus_aureus', colors[3])

g3 <- plot.tree('../tables/Staphylococcus_aureus_mummer_heatmap.dat', colors[3], 0.005, 'Staphylococcus aureus')
g4 <- plot.tree('../tables/Acinetobacter_baumannii_mummer_heatmap.dat', colors[4], 0.007, 'Acinetobacter baumannii')
g5 <- plot.tree('../tables/Pseudomonas_aeruginosa_mummer_heatmap.dat', colors[5], 0.004, 'Pseudomonas aeruginosa')
g6 <- plot.tree('../tables/Enterococcus_faecium_mummer_heatmap.dat', colors[6], 0.0055, 'Enterococcus faecium')
g7 <- plot.tree('../tables/Enterococcus_faecalis_mummer_heatmap.dat', colors[7], 0.005, 'Enterococcus faecalis')
g8 <- plot.tree('../tables/Klebsiella_pneumoniae_mummer_heatmap.dat', colors[8], 0.006, 'Klebsiella pneumoniae')
aux <- function(){
  vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(1, 6)))
  pushViewport(vplayout(1, 1)) # left plot
  h <- hiveplot('Staphylococcus_epidermidis', colors[2], silent = TRUE)
  plotHive(h, ch=0.2, method='ranknorm', bkgnd='white', anNodes = 'tmp_hive.txt', np=FALSE,anNode.gpar=gpar(cex=1.5))
  popViewport(2)
  pushViewport(vplayout(1, 2))
  h <- hiveplot('Acinetobacter_baumannii', colors[4], silent=TRUE)
  plotHive(h, ch=0.2, method='ranknorm', bkgnd='white', anNodes = 'tmp_hive.txt', np=FALSE,anNode.gpar=gpar(cex=1.5))
  
  popViewport(2)
  pushViewport(vplayout(1, 3))
  h <- hiveplot('Pseudomonas_aeruginosa', colors[5], silent=TRUE) ## only one node -- cannot use 'ranknorm'
  plotHive(h, ch=0.2, method='rank', bkgnd='white', anNodes = 'tmp_hive.txt', np=FALSE,anNode.gpar=gpar(cex=1.5))
  
  popViewport(2)
  pushViewport(vplayout(1, 4))
  h <- hiveplot('Enterococcus_faecium', colors[6], silent=TRUE)
  plotHive(h, ch=0.2, method='ranknorm', bkgnd='white', anNodes = 'tmp_hive.txt', np=FALSE,anNode.gpar=gpar(cex=1.5))
  
  popViewport(2)
  pushViewport(vplayout(1, 5))
  h <- hiveplot('Enterococcus_faecalis', colors[7], silent=TRUE)
  plotHive(h, ch=0.2, method='ranknorm', bkgnd='white', anNodes = 'tmp_hive.txt', np=FALSE,anNode.gpar=gpar(cex=1.5))
  
  popViewport(2)
  pushViewport(vplayout(1, 6))
  h <- hiveplot('Klebsiella_pneumoniae', colors[8], silent=TRUE) ## only one node -- cannot use 'ranknorm'
  plotHive(h, ch=0.2, method='rank', bkgnd='white', anNodes = 'tmp_hive.txt', np=FALSE,anNode.gpar=gpar(cex=1.5))
}
aux()

Save plots

ggsave('../plots/fig2d.tree_main.pdf', g3$p, width=10, height=10)
pdf('../plots/fig2d.hive_main.pdf', width = 10, height = 10)
hiveplot('Staphylococcus_aureus', colors[3])
dev.off()
## png 
##   2
ggsave('../plots/sup8.tree_sup.pdf', s1, width=48, height=9)
pdf('../plots/sup8.hive_sup.pdf', width = 48, height = 9)
aux()
dev.off()
## png 
##   2

Session informaton

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
## 
## Matrix products: default
## BLAS: /usr/lib64/R/lib/libRblas.so
## LAPACK: /usr/lib64/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] HiveR_0.3.42   ggtree_1.15.6  ape_5.2        ggsci_2.9     
##  [5] reshape2_1.4.3 stringr_1.3.0  tibble_2.0.1   tidyr_0.8.2   
##  [9] dplyr_0.8.0.1  gridExtra_2.3  ggplot2_3.1.0 
## 
## loaded via a namespace (and not attached):
##  [1] treeio_1.7.3            tidyselect_0.2.5       
##  [3] purrr_0.3.0             lattice_0.20-35        
##  [5] tcltk_3.4.4             colorspace_1.3-2       
##  [7] miniUI_0.1.1.1          htmltools_0.3.6        
##  [9] yaml_2.1.18             rlang_0.3.1            
## [11] manipulateWidget_0.10.0 pillar_1.3.1           
## [13] later_0.7.5             glue_1.3.0             
## [15] withr_2.1.2             RColorBrewer_1.1-2     
## [17] tkrgl_0.8               jpeg_0.1-8             
## [19] rvcheck_0.1.3           plyr_1.8.4             
## [21] munsell_0.5.0           gtable_0.2.0           
## [23] htmlwidgets_1.2         evaluate_0.10.1        
## [25] labeling_0.3            knitr_1.20             
## [27] httpuv_1.4.5            crosstalk_1.0.0        
## [29] parallel_3.4.4          Rcpp_1.0.0             
## [31] xtable_1.8-3            scales_1.0.0           
## [33] backports_1.1.2         promises_1.0.1         
## [35] webshot_0.5.1           jsonlite_1.6           
## [37] mime_0.5                png_0.1-7              
## [39] digest_0.6.18           stringi_1.3.1          
## [41] shiny_1.2.0             cowplot_0.9.2          
## [43] rprojroot_1.3-2         tools_3.4.4            
## [45] rgl_0.99.16             magrittr_1.5           
## [47] lazyeval_0.2.1          crayon_1.3.4           
## [49] pkgconfig_2.0.2         tidytree_0.2.3         
## [51] assertthat_0.2.0        rmarkdown_1.9          
## [53] R6_2.4.0                nlme_3.1-131.1         
## [55] compiler_3.4.4